This document demonstrates the use of the bRF and LASSO-D3S functions for integrative GRN inference.
Those functions infer the regulatory pathways of Arabidopsis thaliana’s roots in response to nitrate (N) induction from Varala et al., 2018.
They use as inputs the expression profiles of N-responsive genes and TFBS information. Prior TFBS information was built by searching in the promoters of the N-responsive genes the PWM of the N-responsive regulators.
load('rdata/inference_input_N_response_varala.rdata')
genes <- input_data$grouped_genes; length(genes)
## [1] 1426
tfs <- input_data$grouped_regressors; length(tfs)
## [1] 201
counts <- input_data$counts; dim(counts)
## [1] 1426 45
load("rdata/pwm_occurrences_N_response_varala.rdata")
dim(pwm_occurrence)
## [1] 1426 201
How does the individual MSE of genes varies with \(\alpha\) for model estimation in GRN inferences?
# ALPHAS <- seq(0,1, by = 0.1)
# lmses <- data.frame(row.names = genes)
#
# for(alpha in ALPHAS){
# set.seed(121314)
# lmses[,as.character(alpha)] <- bRF_inference_MSE(counts, genes, tfs, alpha = alpha, nTrees = 1000,
# pwm_occurrence = pwm_occurrence, nCores = 18)
# }
#
# save(lmses, file = "results/logMSES_RFs_per_genes.rdata")
Clustering the genes :
load(file = "rdata/logMSES_RFs_per_genes_400trees.rdata")
mse <- (lmses-rowMeans(lmses)) / matrixStats::rowSds(as.matrix(lmses))
ha = HeatmapAnnotation(
alpha = anno_simple(as.numeric(rep(colnames(mse),1))),
annotation_name_side = "left")
heatmap_rf_4000 <- Heatmap(mse, row_km = 3, cluster_columns = F, show_row_names = F,
name = "scaled logMSE (z-score)", top_annotation = ha,
row_title = "Genes", column_title = "alpha");heatmap_rf_4000
# lmses_lasso <- data.frame(row.names = genes)
#
# for(alpha in ALPHAS){
# set.seed(121314)
# lmses_lasso[,as.character(alpha)] <- LASSO.D3S_inference_MSE(counts, genes, tfs,
# normalized = TRUE,
# alpha = alpha, N = 50,
# pwm_occurrence = pwm_occurrence, nCores = 15)
# }
#
# save(lmses_lasso, file = "results/logMSES_Lasso_per_genes.rdata")
load(file = "results/logMSES_Lasso_per_genes.rdata")
mse_lasso <- (lmses_lasso-rowMeans(lmses_lasso)) / matrixStats::rowSds(as.matrix(lmses_lasso))
ha = HeatmapAnnotation(
alpha = anno_simple(as.numeric(rep(colnames(mse),1))),
annotation_name_side = "left")
heatmap_lasso <- Heatmap(mse_lasso, row_km = 3, cluster_columns = F, show_row_names = F,
name = "scaled logMSE (z-score)", top_annotation = ha,
row_title = "Genes", column_title = "alpha"); heatmap_lasso
Il semblerait que les gènes dont la MSE diminue ne soient pas les mêmes suivant bRF et LASSO, seul un petit sous ensemble est commun:
hall = HeatmapAnnotation(
method = c(rep("bRF", 11), rep("lasso", 11)),
alpha = anno_simple(as.numeric(rep(colnames(mse),2))),
annotation_name_side = "left")
Heatmap(cbind(mse, mse_lasso), row_km = 6, cluster_columns = F, show_row_names = F,
name = "scaled logMSE", top_annotation = hall, clustering_distance_rows = "spearman",
row_title = "Genes")
On aimerait exclure ou en tout cas identifier dans ces graphiques s’il y a des gènes que l’on prédit mal quel que soit alpha.
En fait les gènes sur lesquels on se trompe le plus sont aussi les gènes les plus exprimés… Donc on va normaliser la MSE par la variance du gène cible pour avoir des MSE comparables entre gènes.
mean_expr <- rowMeans(counts)
var_expr <- matrixStats::rowSds(counts)*matrixStats::rowSds(counts)
all_mses <- cbind(lmses, lmses_lasso)
#all_mses_scaled <- (all_mses-rowMeans(all_mses)) / matrixStats::rowSds(as.matrix(all_mses))
all_mses_scaled <- exp(all_mses) / var_expr
min_mses <- matrixStats::rowMins(as.matrix(all_mses_scaled))
min_mses_lasso <- matrixStats::rowMins(as.matrix(exp(lmses_lasso)/var_expr))
min_mses_rf <- matrixStats::rowMins(as.matrix(exp(lmses)/var_expr))
hist(min_mses_lasso, breaks = 100)
hist(min_mses_rf, breaks = 100)
hist(min_mses, breaks = 100)
# df <- data.frame(mean_expr)
bad_genes_lasso <- genes[which(min_mses_lasso > 0.3)]
bad_genes_rf <- genes[which(min_mses_rf > 0.3)]
bad_genes_lasso <- min_mses_lasso > 0.3
bad_genes_rf <- min_mses_rf > 0.3
# df$bad <- rownames(df) %in% bad_genes
# df$min_lmse <- min_mses
# ggplot(df, aes(x=bad, y=mean_expr)) + geom_boxplot()
# ggplot(df, aes(x=log(mean_expr), y=min_lmse)) + geom_point()
#
mse_lasso_scaled <- exp(lmses_lasso)/var_expr
mse_rf_scaled <- exp(lmses)/var_expr
# mse_lasso_scaled[bad_genes_lasso>0.3,]<-NA
# mse_rf_scaled[bad_genes_rf>0.3,]<-NA
#
#
# ha = HeatmapAnnotation(
# alpha = anno_simple(as.numeric(rep(colnames(mse),1))),
# annotation_name_side = "left")
#
# Heatmap(mse_lasso_scaled, row_km = 0, cluster_columns = F, show_row_names = F,
# name = "scaled logMSE (z-score)", top_annotation = ha,
# row_title = "Genes", column_title = "alpha") + rowAnnotation(
# scaled_min_mse = min_mses_lasso)
#
# Heatmap(mse_rf_scaled, row_km = 0, cluster_columns = F, show_row_names = F,
# name = "scaled logMSE (z-score)", top_annotation = ha,
# row_title = "Genes", column_title = "alpha")+ rowAnnotation(
# scaled_min_mse = min_mses_rf)
Sans normaliser par la MSE moyenne par gène, on ne voit plus les variations de MSE liées à alpha, on voit uniquement les gènes qui sont globalement bien ou mal prédits. Deux méthodes simultanément :
Heatmap(log(all_mses_scaled), row_km = 0, cluster_columns = F, show_row_names = F,
name = "MSE/Var", top_annotation = hall, col = rev(hcl.colors(5, palette = "Reds 2")),
row_title = "Genes")+ rowAnnotation(
scaled_min_mse = min_mses)
Je remets donc un z-score sur la MSE normalisée par gène englobant les deux méthodes, pour visualiser plus précisément les variations de MSE suivant alpha:
Heatmap((all_mses_scaled-rowMeans(all_mses_scaled))/matrixStats::rowSds(as.matrix(all_mses_scaled)),
row_km = 0, cluster_columns = F, show_row_names = F,
name = "MSE/Var", top_annotation = hall,
row_title = "Genes")+ rowAnnotation(
scaled_min_mse = min_mses)
En fait, comme le z-score est calculé sur les deux méthodes, on a encore une grosse contribution à la couleur de la différence de MSE inhérente aux deux méthodes, et non à l’effet de l’intégration. Je calcule donc un z-score par méthodes pour faire la représentation:
mse_lasso_scaled_zscore <- (mse_lasso_scaled-rowMeans(mse_lasso_scaled, na.rm = T))/matrixStats::rowSds(as.matrix(mse_lasso_scaled, na.rm=T))
mse_rf_scaled_zscore <- (mse_rf_scaled-rowMeans(mse_rf_scaled))/matrixStats::rowSds(as.matrix(mse_rf_scaled))
col_fun = circlize::colorRamp2(c(-0.8, 0, 0.3), c("blue", "white", "red"))
Heatmap(cbind(mse_rf_scaled_zscore, mse_lasso_scaled_zscore),
row_km = 0, cluster_columns = F, show_row_names = F,
name = "MSE/Var", top_annotation = hall,
row_title = "Genes")+ rowAnnotation(
scaled_min_mse = min_mses_lasso-min_mses_rf, col = list(scaled_min_mse = col_fun))
Heatmap(cbind(mse_rf_scaled_zscore, mse_lasso_scaled_zscore),
row_km = 0, cluster_columns = F, show_row_names = F,
name = "MSE/Var", top_annotation = hall,
row_title = "Genes")+ rowAnnotation(
bad_for_lasso = bad_genes_lasso,
bad_for_rf = bad_genes_rf)
c’est informatif : certains gènes bénéficient de l’intégration de données et d’autres non, et ce n’est pas spécialement lié à si on les prédit bien globalement. Par contre celà dépend visiblement de la méthode utilisée et nous n’avons pas encore d’explication pour ça…
Définir des groupes suivant l’optimum comme par exemple:
get_optimum <- function(gene, method = "rf"){
if(method=="rf")
return(as.numeric(names(which.min(mse_rf_scaled_zscore[gene,]))))
if(method=="lasso")
return(as.numeric(names(which.min(mse_lasso_scaled_zscore[gene,]))))
}
assign_cluster <- function(opt){
if(opt == 0)
return("0")
if(opt>0 & opt <=0.4){
return("0.1-0.4")
}
if(opt>0.4 & opt <=0.7){
return("0.5-0.7")
}
if(opt>0.7 & opt <=1){
return("0.8-1")
}
}
clusters_rf <- sapply(sapply(genes, get_optimum, method = "rf"), assign_cluster)
clusters_lasso <- sapply(sapply(genes, get_optimum, method = "lasso"), assign_cluster)
table(clusters_lasso)
## clusters_lasso
## 0 0.1-0.4 0.5-0.7 0.8-1
## 635 474 197 120
table(clusters_rf)
## clusters_rf
## 0 0.1-0.4 0.5-0.7 0.8-1
## 440 496 218 272
Heatmap(cbind(mse_rf_scaled_zscore, mse_lasso_scaled_zscore),
row_km = 0, cluster_columns = F, show_row_names = F,
name = "MSE/Var", top_annotation = hall,
row_title = "Genes")+ rowAnnotation(
clusters_rf = clusters_rf,
clusters_lasso = clusters_lasso,
col=list(clusters_lasso = setNames(c("red","orange", "yellow", "green"),
nm = names(table(clusters_lasso))),
clusters_rf= setNames(c("red","orange", "yellow", "green"),
nm = names(table(clusters_lasso))) ))
draw_specific_genes <- function(genes){
Heatmap(cbind(mse_rf_scaled_zscore, mse_lasso_scaled_zscore)[genes,],
row_km = 0, cluster_columns = F, show_row_names = F,
name = "MSE/Var", top_annotation = hall,
row_title = "Genes")+ rowAnnotation(
clusters_rf = clusters_rf[genes],
clusters_lasso = clusters_lasso[genes],
col=list(clusters_lasso = setNames(c("red","orange", "yellow", "green"),
nm = names(table(clusters_lasso))),
clusters_rf= setNames(c("red","orange", "yellow", "green"),
nm = names(table(clusters_lasso))) ))
}
all_pwm_pos <- names(which(clusters_lasso == "0.8-1" & clusters_rf == "0.8-1"))
lasso_bad_rf_good <- names(which(clusters_lasso == "0" & clusters_rf == "0.8-1"))
lasso_good_rf_bad <- names(which(clusters_lasso == "0.8-1" & clusters_rf == "0"))
all_yellow <- names(which(clusters_lasso == "0.5-0.7" & clusters_rf == "0.5-0.7"))
all_orange <- names(which(clusters_lasso == "0.1-0.4" & clusters_rf == "0.1-0.4"))
all_red <- names(which(clusters_lasso == "0" & clusters_rf == "0"))
different <- names(which(clusters_lasso != clusters_rf))
draw_specific_genes(all_pwm_pos)
draw_specific_genes(c(lasso_bad_rf_good, lasso_good_rf_bad))
draw_specific_genes(all_yellow)
draw_specific_genes(all_orange)
draw_specific_genes(all_red)
draw_specific_genes(different)
Nombre de motifs dans le promoteur, niveau d’expression, ontologies, sont-ils des TFs? leur taille?
Quelle est la précision rappel des sous réseaux concernant ces gènes.
load("rdata/pwm_prom_jaspar_dap.rdata")
genes_info <- data.frame(genes = genes,
cluster_lasso = clusters_lasso[genes],
cluster_rf = clusters_rf[genes])
genes_info$is_tf <- genes %in% tfs
genes_info$var <- var_expr
genes_info$expr <- mean_expr
genes_info$nb_motifs <- table(pwm_prom$target)[genes]
genes_info%>%
mutate(cluster_lasso_label=paste("lasso :", cluster_lasso))%>%
ggplot(aes(x=cluster_rf, y=nb_motifs,
fill = cluster_rf==cluster_lasso)) +
geom_violin()+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
facet_wrap(~cluster_lasso_label )+ xlab("cluster RF") +
ggtitle(("Number of motifs in promoter"))
## Don't know how to automatically pick scale for object of type <table>.
## Defaulting to continuous.
genes_info%>%
mutate(cluster_lasso_label=paste("lasso :", cluster_lasso))%>%
ggplot(aes(x=cluster_rf, y=var,
fill = cluster_rf==cluster_lasso)) +
geom_violin()+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
facet_wrap(~cluster_lasso_label )+ xlab("cluster RF") +
ggtitle(("Gene variance for all groups")) + scale_y_log10()
genes_info%>%
ggplot(aes(x=cluster_rf, y=var)) +
geom_violin()+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Gene variance for RF groups")) + scale_y_log10()+
stat_compare_means()
genes_info%>%
ggplot(aes(x=cluster_lasso, y=var)) +
geom_violin()+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Gene variance for LASSO groups")) + scale_y_log10()+
stat_compare_means()
genes_info%>%
mutate(cluster_lasso_label=paste("lasso :", cluster_lasso))%>%
ggplot(aes(x=cluster_rf, y=expr,
fill = cluster_rf==cluster_lasso)) +
geom_violin()+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
facet_wrap(~cluster_lasso_label )+ xlab("cluster RF") +
ggtitle(("Gene expression for all groups")) + scale_y_log10()
genes_info%>%
mutate(cluster_lasso_label=paste("lasso :", cluster_lasso))%>%
mutate(agree_on_beneficial_pwms = cluster_lasso == "0.8-1" & cluster_lasso==cluster_rf) %>%
ggplot(aes(x=agree_on_beneficial_pwms, y=expr,
fill = agree_on_beneficial_pwms)) +
geom_violin()+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
xlab("cluster RF") +
ggtitle(("Gene expression : PWM beneficial for both methods vs other groups")) + scale_y_log10()
genes_info%>%
ggplot(aes(x=cluster_rf, y=expr)) +
geom_violin()+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Gene expression for RF groups")) + scale_y_log10()+
stat_compare_means()
genes_info%>%
ggplot(aes(x=cluster_lasso, y=expr)) +
geom_violin()+geom_jitter(width=0.1)+
geom_boxplot(width=0.1, fill = "white")+
ggtitle(("Gene expression for LASSO groups")) + scale_y_log10()+
stat_compare_means()
clusters_info <- genes_info %>%
group_by(cluster_lasso, cluster_rf) %>%
summarise(n=n(),
tf_frac=sum(is_tf)/n())
## `summarise()` has grouped output by 'cluster_lasso'. You can override using the
## `.groups` argument.
ggplot(clusters_info, aes(x=cluster_rf, y=tf_frac,
label = paste("n=",n),
fill = cluster_rf==cluster_lasso)) +
geom_bar(stat = "identity")+
facet_wrap(~cluster_lasso )+
geom_hline(yintercept = length(tfs)/length(genes)) +
geom_text(y=0.22) + xlab("cluster RF") +
ggtitle("Fraction of TFs in all groups")
genes_info %>%
group_by(cluster_lasso) %>%
summarise(n=n(),
tf_frac=sum(is_tf)/n()) %>%
ggplot(aes(x=cluster_lasso, y=tf_frac,
label = paste("n=",n))) +
geom_bar(stat = "identity")+
geom_hline(yintercept = length(tfs)/length(genes)) +
geom_text(y=0.16) + xlab("cluster RF") +
ggtitle("Fraction of TFs in LASSO groups")
genes_info %>%
group_by(cluster_rf) %>%
summarise(n=n(),
tf_frac=sum(is_tf)/n()) %>%
ggplot(aes(x=cluster_rf, y=tf_frac,
label = paste("n=",n))) +
geom_bar(stat = "identity")+
geom_hline(yintercept = length(tfs)/length(genes)) +
geom_text(y=0.15) + xlab("cluster RF") +
ggtitle("Fraction of TFs in RF groups")
Est-ce qu’on a significativement plus de TFs dans les genes qui sont
bien prédits avec les PWM chez le lasso?
fisher.test(matrix(c(length(genes), length(tfs), 120, 120*0.175),
nrow = 2), alternative = "greater")
##
## Fisher's Exact Test for Count Data
##
## data: matrix(c(length(genes), length(tfs), 120, 120 * 0.175), nrow = 2)
## p-value = 0.2257
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 0.7895277 Inf
## sample estimates:
## odds ratio
## 1.241379
load("rdata/networks_bRF_lasso.rdata")
prettyZero <- function(l){
max.decimals = max(nchar(str_extract(l, "\\.[0-9]+")), na.rm = T)-1
lnew = formatC(l, replace.zero = T, zero.print = "0",
digits = max.decimals, format = "f", preserve.width=T)
return(lnew)}
all_pwm_pos
## [1] "AT2G19600" "AT5G65830" "AT2G42280" "AT1G31320" "AT1G22500" "AT2G01670"
## [7] "AT5G50400" "AT1G60960" "AT3G49570" "AT2G28305" "AT4G35320" "AT2G47700"
## [13] "AT2G33020" "AT4G10150" "AT2G24120" "AT3G61830" "AT2G28850" "AT3G02832"
## [19] "AT3G06110" "AT5G16370" "AT2G46790" "AT3G14720" "AT3G63280" "AT2G34170"
## [25] "AT1G69310" "AT3G62690" "AT1G63880" "AT1G80440" "AT2G25090" "AT5G46050"
## [31] "AT1G30080" "AT5G67480" "AT1G76020" "AT4G37400" "AT2G05590" "AT3G50660"
## [37] "AT1G53530"
draw_precision_recall <- function(genes){
subset <- lapply(edges, function(net){net[net$to %in% genes,]})
color_palette = c("#88002D", "#C12131", "#EC5D2F", "#FE945C", "#FFC08E" )
settings <- c("method", "alpha", "density", "rep")
# number of edges per network
nrows <-
data.frame(alpha_rep = names(unlist(lapply(edges, FUN = nrow))),
n_edges = unlist(lapply(subset, FUN = nrow)))
nrows[, settings] <-
str_split_fixed(nrows$alpha_rep, '_', length(settings))
subset$`LASSO-D3S_0_0.005_1`
val_conn <-
evaluate_networks(
subset,
validation = c("TARGET", "CHIPSeq", "DAPSeq"),
nCores = 10,
input_genes = genes,
input_tfs = tfs
)
val_conn[, settings] <-
str_split_fixed(val_conn$network_name, '_', length(settings))
val_connecTF <- val_conn %>%
dplyr::select(-network_name) %>%
mutate(alpha = as.numeric(alpha)) %>%
ggplot(aes(color = density, x = alpha, y = precision)) +
ggh4x::facet_nested_wrap(vars(method), ncol = 8, nest_line = T) + geom_point() +
geom_smooth(aes(fill = density), alpha = 0.1) +
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_text(size = 12),
legend.position = 'none'
) +
xlab(expression(alpha)) + ylab("Precision") +
ggtitle("Precision against ConnecTF") +
scale_x_continuous(labels = prettyZero) +
scale_color_manual(name = "Network density", values = color_palette) +
scale_fill_manual(name = "Network density", values = color_palette)
val_connecTF
}
for(group in unique(clusters_lasso)){
print(draw_precision_recall(names(which(clusters_lasso == group)))+
ggtitle(paste("LASSO, group", group)))
print(draw_precision_recall(names(which(clusters_rf == group)))+
ggtitle(paste("RF, group", group)))
}
##
## Parallel network validation with AranetBench Using 10 cores.
## 6.026 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
##
## Parallel network validation with AranetBench Using 10 cores.
## 7.554 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
##
## Parallel network validation with AranetBench Using 10 cores.
## 12.184 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
##
## Parallel network validation with AranetBench Using 10 cores.
## 9.503 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
##
## Parallel network validation with AranetBench Using 10 cores.
## 10.342 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
##
## Parallel network validation with AranetBench Using 10 cores.
## 10.897 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
##
## Parallel network validation with AranetBench Using 10 cores.
## 7.642 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
##
## Parallel network validation with AranetBench Using 10 cores.
## 8.258 sec elapsed
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
library(DIANE)
##
## Attaching package: 'DIANE'
## The following object is masked from 'package:igraph':
##
## normalize
background <- rownames(gene_annotations$`Arabidopsis thaliana`)
background <- convert_from_agi(background)
##
for(group in unique(clusters_lasso)){
genes_i <- names(which(clusters_lasso == group))
print(paste("LASSO", length(genes_i), "genes,", group))
genes_i <- convert_from_agi(genes_i)
go_lasso <- enrich_go(genes_i, background)
print(DIANE::draw_enrich_go(go_lasso))
genes_i <- names(which(clusters_rf == group))
print(paste("RF", length(genes_i), "genes, min mse at", group))
genes_i <- convert_from_agi(genes_i)
go_rf <- enrich_go(genes_i, background)
print(DIANE::draw_enrich_go(go_rf))
common_go <- intersect(go_lasso$ID, go_rf$ID)
print(go_lasso[common_go,])
}
## [1] "LASSO 120 genes, 0.8-1"
##
## [1] "RF 272 genes, min mse at 0.8-1"
## [1] ID Description GeneRatio BgRatio pvalue p.adjust
## [7] qvalue geneID Count
## <0 rows> (or 0-length row.names)
## [1] "LASSO 635 genes, 0"
## [1] "RF 440 genes, min mse at 0"
## ID Description GeneRatio
## GO:0022613 GO:0022613 ribonucleoprotein complex biogenesis 33/532
## GO:0042254 GO:0042254 ribosome biogenesis 31/532
## GO:0034470 GO:0034470 ncRNA processing 31/532
## GO:0006364 GO:0006364 rRNA processing 26/532
## GO:0016072 GO:0016072 rRNA metabolic process 26/532
## GO:0042273 GO:0042273 ribosomal large subunit biogenesis 11/532
## GO:0015711 GO:0015711 organic anion transport 11/532
## GO:0015849 GO:0015849 organic acid transport 11/532
## GO:0019318 GO:0019318 hexose metabolic process 8/532
## GO:2001057 GO:2001057 reactive nitrogen species metabolic process 6/532
## GO:0042126 GO:0042126 nitrate metabolic process 5/532
## GO:0042128 GO:0042128 nitrate assimilation 5/532
## GO:0071941 GO:0071941 nitrogen cycle metabolic process 5/532
## GO:0015695 GO:0015695 organic cation transport 3/532
## BgRatio pvalue p.adjust qvalue
## GO:0022613 427/21664 7.407051e-09 2.498028e-06 2.249404e-06
## GO:0042254 349/21664 7.779240e-10 7.432053e-07 6.692356e-07
## GO:0034470 388/21664 9.826013e-09 2.651058e-06 2.387204e-06
## GO:0006364 256/21664 1.101861e-09 7.432053e-07 6.692356e-07
## GO:0016072 268/21664 2.930627e-09 1.317805e-06 1.186647e-06
## GO:0042273 93/21664 1.749132e-05 1.123609e-03 1.011779e-03
## GO:0015711 108/21664 7.113627e-05 3.554179e-03 3.200439e-03
## GO:0015849 115/21664 1.257931e-04 5.107640e-03 4.599287e-03
## GO:0019318 90/21664 1.668753e-03 4.414015e-02 3.974697e-02
## GO:2001057 18/21664 3.078488e-06 3.405256e-04 3.066338e-04
## GO:0042126 13/21664 9.581337e-06 7.180680e-04 6.466002e-04
## GO:0042128 13/21664 9.581337e-06 7.180680e-04 6.466002e-04
## GO:0071941 17/21664 4.246239e-05 2.386740e-03 2.149193e-03
## GO:0015695 11/21664 2.097292e-03 4.963592e-02 4.469576e-02
## geneID
## GO:0022613 MEE49/NA/atENP1/RPL7A/NA/NA/EMB2762/NA/AtGHS40/NA/NA/CP33/AtNug2/ATFAS4/ISE4/NA/WDR55/NA/RBL/NA/NA/NA/NA/AtNMD3/NA/REB1/ATFIB2/AtPEIP1/NA/RID2/NA/TOZ/atBRX1-1
## GO:0042254 MEE49/NA/atENP1/RPL7A/NA/NA/EMB2762/NA/AtGHS40/NA/NA/CP33/AtNug2/ATFAS4/NA/WDR55/NA/RBL/NA/NA/NA/AtNMD3/NA/REB1/ATFIB2/AtPEIP1/NA/RID2/NA/TOZ/atBRX1-1
## GO:0034470 MEE49/NA/TRZ3/atENP1/RPL7A/NA/NA/EMB2762/AtGHS40/NA/NA/CP33/ATFAS4/NA/WDR55/NA/NA/NA/NA/REB1/NA/ATFIB2/AtPEIP1/NA/RID2/NA/TOZ/NA/atBRX1-1/NA/AtRPP30
## GO:0006364 MEE49/NA/atENP1/RPL7A/NA/NA/EMB2762/AtGHS40/NA/NA/CP33/ATFAS4/NA/WDR55/NA/NA/NA/NA/REB1/ATFIB2/AtPEIP1/NA/RID2/NA/TOZ/atBRX1-1
## GO:0016072 MEE49/NA/atENP1/RPL7A/NA/NA/EMB2762/AtGHS40/NA/NA/CP33/ATFAS4/NA/WDR55/NA/NA/NA/NA/REB1/ATFIB2/AtPEIP1/NA/RID2/NA/TOZ/atBRX1-1
## GO:0042273 MEE49/NA/RPL7A/NA/NA/RBL/NA/NA/AtPEIP1/NA/atBRX1-1
## GO:0015711 ALMT9/AtmSFC1/APC2/DiT1/ATSDAT/AIT1/ADNT1/ALMT4/AtDTX50/UMAMIT14/AtNiaP
## GO:0015849 ALMT9/AtmSFC1/DiT1/ATSDAT/AIT1/NA/ALMT4/AtDTX50/UMAMIT14/SIAR1/AtNiaP
## GO:0019318 GAPCP-2/G6PD2/PGI/FRK1/AtFBA8/FRK3/REB1/G6PD5
## GO:2001057 AtUPM1/GLT1/ATGSKB6/NLP7/CML23/ATGLN1;1
## GO:0042126 AtUPM1/GLT1/ATGSKB6/NLP7/ATGLN1;1
## GO:0042128 AtUPM1/GLT1/ATGSKB6/NLP7/ATGLN1;1
## GO:0071941 AtUPM1/GLT1/ATGSKB6/NLP7/ATGLN1;1
## GO:0015695 APC2/ADNT1/AtNiaP
## Count
## GO:0022613 33
## GO:0042254 31
## GO:0034470 31
## GO:0006364 26
## GO:0016072 26
## GO:0042273 11
## GO:0015711 11
## GO:0015849 11
## GO:0019318 8
## GO:2001057 6
## GO:0042126 5
## GO:0042128 5
## GO:0071941 5
## GO:0015695 3
## [1] "LASSO 474 genes, 0.1-0.4"
## [1] "RF 496 genes, min mse at 0.1-0.4"
## ID Description GeneRatio BgRatio
## GO:0022613 GO:0022613 ribonucleoprotein complex biogenesis 33/391 427/21664
## GO:0042254 GO:0042254 ribosome biogenesis 30/391 349/21664
## GO:0034470 GO:0034470 ncRNA processing 29/391 388/21664
## GO:0006364 GO:0006364 rRNA processing 25/391 256/21664
## pvalue p.adjust qvalue
## GO:0022613 2.494920e-12 1.550593e-09 1.445741e-09
## GO:0042254 1.758817e-12 1.550593e-09 1.445741e-09
## GO:0034470 1.238270e-10 3.078339e-08 2.870179e-08
## GO:0006364 8.272055e-12 3.427388e-09 3.195626e-09
## geneID
## GO:0022613 NA/ARPF2/AtNOB1/NA/NA/EDA13/NA/APUM23/NA/RH10/AtNLE/DIM1A/NA/UGE3/ATPWP2/AtTRM7b/NA/NA/DG238/RH17/PCP1/SmD1a/AtERG2/AtRH36/NA/NA/RH29/NA/NA/IMP4/NA/EIF2/NOF1
## GO:0042254 NA/ARPF2/AtNOB1/NA/NA/EDA13/NA/APUM23/NA/RH10/AtNLE/DIM1A/NA/UGE3/ATPWP2/AtTRM7b/NA/NA/DG238/RH17/PCP1/AtERG2/AtRH36/NA/RH29/NA/NA/IMP4/NA/NOF1
## GO:0034470 ATIPT3/NA/ARPF2/AtNOB1/NA/NA/EDA13/NA/APUM23/NA/RH10/DIM1A/UGE3/ATPWP2/AtTRM1b/AtTRM7b/NA/NA/PCP1/AtTRM11/AtTRM8a/AtRH36/NA/RH29/NA/NA/IMP4/NA/NOF1
## GO:0006364 NA/ARPF2/AtNOB1/NA/NA/EDA13/NA/APUM23/NA/RH10/DIM1A/UGE3/ATPWP2/AtTRM7b/NA/NA/PCP1/AtRH36/NA/RH29/NA/NA/IMP4/NA/NOF1
## Count
## GO:0022613 33
## GO:0042254 30
## GO:0034470 29
## GO:0006364 25
## [1] "LASSO 197 genes, 0.5-0.7"
## [1] "RF 218 genes, min mse at 0.5-0.7"
## ID Description GeneRatio BgRatio pvalue
## GO:0042254 GO:0042254 ribosome biogenesis 11/170 349/21664 1.024780e-04
## GO:0006364 GO:0006364 rRNA processing 10/170 256/21664 3.612792e-05
## GO:0016072 GO:0016072 rRNA metabolic process 10/170 268/21664 5.313390e-05
## p.adjust qvalue
## GO:0042254 0.010644906 0.009991608
## GO:0006364 0.008878772 0.008333866
## GO:0016072 0.008878772 0.008333866
## geneID Count
## GO:0042254 NA/NA/PCP2/NA/EBP2/ATRRP4/RNC4/NA/NA/ARRS1/NA 11
## GO:0006364 NA/PCP2/NA/EBP2/ATRRP4/RNC4/NA/NA/ARRS1/NA 10
## GO:0016072 NA/PCP2/NA/EBP2/ATRRP4/RNC4/NA/NA/ARRS1/NA 10